Libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggcorrplot)
library(grid)
library(vcd)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(pls)
##
## Attaching package: 'pls'
##
## The following object is masked from 'package:caret':
##
## R2
##
## The following object is masked from 'package:stats':
##
## loadings
library(PRROC)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
Load data
# Load the census-income dataset. The original line chained two assignments
# (`df = expectancy = read.csv(...)`); `expectancy` was never used anywhere
# else, so keep a single, conventional assignment.
df <- read.csv('~/Downloads/linear_models_project_data.csv')
head(df)
## X age workclass fnlwgt education education.num marital.status
## 1 0 39 State-gov 77516 Bachelors 13 Never-married
## 2 1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## 3 2 38 Private 215646 HS-grad 9 Divorced
## 4 3 53 Private 234721 11th 7 Married-civ-spouse
## 5 4 28 Private 338409 Bachelors 13 Married-civ-spouse
## 6 5 37 Private 284582 Masters 14 Married-civ-spouse
## occupation relationship race sex capital.gain capital.loss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## hours.per.week native.country income
## 1 40 United-States <=50K
## 2 13 United-States <=50K
## 3 40 United-States <=50K
## 4 40 United-States <=50K
## 5 40 Cuba <=50K
## 6 40 United-States <=50K
Drop missing values
# Recode missing-value markers to NA, then drop incomplete rows.
# NOTE(review): this only catches cells containing the literal string "NaN";
# confirm the raw CSV does not encode missing values another way as well
# (e.g. "?" in the classic adult/census extract).
df = replace(df, df=="NaN", NA)
df = df %>% drop_na()
df %>% head()
## X age workclass fnlwgt education education.num marital.status
## 1 0 39 State-gov 77516 Bachelors 13 Never-married
## 2 1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## 3 2 38 Private 215646 HS-grad 9 Divorced
## 4 3 53 Private 234721 11th 7 Married-civ-spouse
## 5 4 28 Private 338409 Bachelors 13 Married-civ-spouse
## 6 5 37 Private 284582 Masters 14 Married-civ-spouse
## occupation relationship race sex capital.gain capital.loss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## hours.per.week native.country income
## 1 40 United-States <=50K
## 2 13 United-States <=50K
## 3 40 United-States <=50K
## 4 40 United-States <=50K
## 5 40 Cuba <=50K
## 6 40 United-States <=50K
Rename ‘income’ values
# Some rows write income with a trailing period ("<=50K.", ">50K."); strip
# it so income has exactly two consistent levels. replace() scans every
# column, but these strings are only expected to occur in `income`.
df = replace(df, df=="<=50K.", "<=50K")
df = replace(df, df==">50K.", ">50K")
df %>% head()
## X age workclass fnlwgt education education.num marital.status
## 1 0 39 State-gov 77516 Bachelors 13 Never-married
## 2 1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## 3 2 38 Private 215646 HS-grad 9 Divorced
## 4 3 53 Private 234721 11th 7 Married-civ-spouse
## 5 4 28 Private 338409 Bachelors 13 Married-civ-spouse
## 6 5 37 Private 284582 Masters 14 Married-civ-spouse
## occupation relationship race sex capital.gain capital.loss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## hours.per.week native.country income
## 1 40 United-States <=50K
## 2 13 United-States <=50K
## 3 40 United-States <=50K
## 4 40 United-States <=50K
## 5 40 Cuba <=50K
## 6 40 United-States <=50K
Collapse education into bins
# Collapse the fine-grained education levels into coarser bins using a
# single lookup table instead of eleven chained replace() calls. Values not
# listed in the table (Bachelors, Masters, etc.) are left untouched,
# exactly as before.
edu_bins <- c(
  "1st-4th"    = "Elementary/Middle",
  "5th-6th"    = "Elementary/Middle",
  "7th-8th"    = "Elementary/Middle",
  "Preschool"  = "Elementary/Middle",
  "Assoc-acdm" = "Associates",
  "Assoc-voc"  = "Associates",
  "9th"        = "Highschool",
  "10th"       = "Highschool",
  "11th"       = "Highschool",
  "12th"       = "Highschool",
  "HS-grad"    = "Highschool"
)
needs_bin <- df$education %in% names(edu_bins)
df$education[needs_bin] <- unname(edu_bins[df$education[needs_bin]])
head(df)
## X age workclass fnlwgt education education.num marital.status
## 1 0 39 State-gov 77516 Bachelors 13 Never-married
## 2 1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## 3 2 38 Private 215646 Highschool 9 Divorced
## 4 3 53 Private 234721 Highschool 7 Married-civ-spouse
## 5 4 28 Private 338409 Bachelors 13 Married-civ-spouse
## 6 5 37 Private 284582 Masters 14 Married-civ-spouse
## occupation relationship race sex capital.gain capital.loss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## hours.per.week native.country income
## 1 40 United-States <=50K
## 2 13 United-States <=50K
## 3 40 United-States <=50K
## 4 40 United-States <=50K
## 5 40 Cuba <=50K
## 6 40 United-States <=50K
How does the distribution of income vary across different age groups?
# Bin age into decade groups. With `right = FALSE` the intervals are
# left-closed/right-open — [0,20), [20,30), ..., [80,Inf) — so the labels
# must describe those ranges. The previous labels ("0-20", "21-30", ...)
# were off by one: age 20 actually fell into the bin labelled "21-30".
df$age_group <- cut(df$age,
breaks = c(0, 20, 30, 40, 50, 60, 70, 80, Inf),
labels = c("0-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"),
right = FALSE)
# Share of each income class falling in each age bin.
# Note: after summarise() the data are still grouped by `income`, so the
# proportions sum to 1 WITHIN each income level, not within each age group.
df %>%
group_by(income, age_group) %>%
summarise(count = n()) %>%
mutate(proportion = count / sum(count)) %>%
ungroup() %>%
ggplot(aes(x = age_group, y = proportion, fill = income)) +
geom_bar(stat = 'identity', position = 'dodge')+theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggtitle("Proportion of Income by Age Group") + xlab("Age Group") + ylab("Proportion") + theme(plot.title = element_text(hjust = 0.5)) +
labs(fill = "Income")
## `summarise()` has grouped output by 'income'. You can override using the
## `.groups` argument.
What is the relationship between the number of hours worked per week and income level?
# Work on a copy with a capitalised column name so the legend reads "Income".
df2 <- df %>% rename(Income = income)
# Boxplot of weekly hours by income level.
df2 %>%
  ggplot(aes(x = hours.per.week, y = Income, color = Income, fill = Income)) +
  geom_boxplot() +
  ggtitle("Hours worked each week per Income Level") +
  xlab("Hours worked per week") +
  ylab("Income Level") +
  theme(plot.title = element_text(hjust = 0.5))
What is the relationship between the number of hours worked per week and income level?
df %>% ggplot(aes(x=hours.per.week, y=income, color=income))+geom_violin()
Converting income column to dummy variables, find correlation between education number and income greater than 50K
# One-hot encode income (fastDummies is namespaced via ::, so it needs no
# library() call), then correlate years of education with the >50K indicator.
data_dummies = fastDummies::dummy_cols(df, select_columns="income")
cor(data_dummies$education.num, data_dummies$`income_>50K`)
## [1] 0.3327999
Income level depending on education level
# Order education as an ordinal factor from lowest to highest attainment so
# the x-axis increases with education level.
df$education = factor(df$education, levels = c("Elementary/Middle", "Highschool", 'Associates', "Some-college", 'Bachelors','Prof-school', 'Masters', 'Doctorate'))
# Proportions sum to 1 within each income level (data remain grouped by
# `income` after summarise()).
df %>%
group_by(income, education) %>%
summarise(count = n()) %>%
mutate(proportion = count / sum(count)) %>%
ungroup() %>%
ggplot(aes(x = education, y = proportion, fill = income)) +
geom_bar(stat = 'identity', position = 'dodge')+theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggtitle("Proportion of Income by Education Level") + xlab("Education Level") + ylab("Proportion") + theme(plot.title = element_text(hjust = 0.5)) +
labs(fill = "Income")
## `summarise()` has grouped output by 'income'. You can override using the
## `.groups` argument.
Interpretation: As the graph above shows, the largest group earning more than 50K holds a bachelor's degree — largely because the data contain more bachelor's-degree holders than people with master's or doctoral degrees. The same logic applies to the high-school column: because the data include more people whose highest attainment is high school, that column dominates among those earning less than 50K.
Income level depending on occupation
# Share of each income class accounted for by every occupation. The counts
# come from count(); the explicit group_by(income) reproduces the original
# within-income-level proportions.
df %>%
  count(income, occupation, name = "count") %>%
  group_by(income) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>%
  ggplot(aes(x = occupation, y = proportion, fill = income)) +
  geom_col(position = 'dodge') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Income Level proportion per Occupation") +
  xlab("Occupation") +
  ylab("Proportion") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(fill = "Income")
## `summarise()` has grouped output by 'income'. You can override using the
## `.groups` argument.
Interpretation: As the bar graph above shows, people in executive/managerial positions make up the largest share of those earning above 50K, followed by professional/specialty occupations. This matches real life: executive managers earn a lot, and professors who do substantial research can earn a lot as well. Conversely, the largest share of those earning less than 50K work in clerical jobs, which include tasks such as data entry, answering phone calls, and filing paperwork.
Correlation Matrix between all the numerical variables.
# Pairwise Pearson correlations among the numeric predictors, visualised as
# a labelled heatmap.
cor_mat <- cor(df[,c('age',"education.num", 'capital.gain','capital.loss','hours.per.week')])
ggcorrplot(cor_mat,lab=TRUE, type='full')
Interpretation: As the correlation matrix shows, multicollinearity is not a concern because no pair of variables is highly correlated with another.
What is the relationship between mean hours.per.week and age?
# Average weekly hours at each distinct age, then scatter mean hours (x)
# against age (y).
mean_df <- df %>%
  group_by(age) %>%
  summarise(mean_hours_per_week = mean(hours.per.week))
mean_df %>%
  ggplot(aes(x = mean_hours_per_week, y = age)) +
  geom_point(color = "red") +
  labs(title = "Scatterplot of Mean Hours Per Week vs. Age",
       x = "Mean Hours Per Week",
       y = "Age") +
  theme_minimal()
What is the relationship between marital status and age?
# Age distribution within each marital-status category.
ggplot(df, aes(x = marital.status, y = age, fill = marital.status)) +
geom_boxplot() +
labs(title = "Boxplot of Age by Marital Status",
x = "Marital Status",
y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5))
Examine distribution of capital loss and capital gain
# Frequency table of capital.loss values with each value's percentage share.
value_counts <- table(df$capital.loss)
df_capital.loss_count <- data.frame(value_counts)
df_capital.loss_count$pct <- round(100*df_capital.loss_count$Freq / sum(df_capital.loss_count$Freq),2)
head(df_capital.loss_count)
## Var1 Freq pct
## 1 0 43082 95.27
## 2 155 1 0.00
## 3 213 5 0.01
## 4 323 5 0.01
## 5 419 1 0.00
## 6 625 17 0.04
# Frequency table of capital.gain values with each value's percentage share.
value_counts2 <- table(df$capital.gain)
df_capital.gain_count <- data.frame(value_counts2)
df_capital.gain_count$pct <- round(100*df_capital.gain_count$Freq / sum(df_capital.gain_count$Freq),2)
head(df_capital.gain_count)
## Var1 Freq pct
## 1 0 41432 91.62
## 2 114 8 0.02
## 3 401 2 0.00
## 4 594 42 0.09
## 5 914 10 0.02
## 6 991 4 0.01
Interpretation: As the tables above show, most people in the dataset have zero capital loss and zero capital gain. This matches reality: most people report no capital gains or losses.
Association between education level and income
# Chi-squared test of independence between education bin and income level.
contingency_table = table(df$education, df$income)
chisq.test(contingency_table)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 5814.2, df = 7, p-value < 2.2e-16
Interpretation: The chi-squared statistic is very large and the p-value is far below any conventional threshold, so we reject the null hypothesis of independence and conclude that education level and income are strongly associated.
Association between age group and income
# Chi-squared test of independence between age group and income level.
contingency_table = table(df$age_group, df$income)
chisq.test(contingency_table)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 4234.6, df = 7, p-value < 2.2e-16
Interpretation: Likewise, the chi-squared statistic is very large and the p-value is far below any conventional threshold, so we reject the null hypothesis of independence and conclude that age group and income are strongly associated.
Train Test split by 80% training and 20% testing datasets
# Design matrix for predicting income.
# - `0 + .` drops the intercept so every factor level gets its own column.
# - `- X` excludes the CSV row-index column: it is not a real feature, and
#   the lasso was even selecting it (pure noise/leakage).
# NOTE(review): age_group (derived from age) remains a predictor alongside
# age itself — confirm that is intended.
X = model.matrix(income ~ 0 + . - X, data = df)
X = scale(X)
y = df$income
# Seed the RNG so the 80/20 split is reproducible.
set.seed(123)
train_index = sample(1:nrow(df), 0.8 * nrow(df))
train_X = X[train_index, ]
train_y = y[train_index]
test_X = X[-train_index, ]
test_y = y[-train_index]
10 fold cross validation and finding best lambda for ‘income’ categorical variable
# 10-fold cross-validated lasso (alpha = 1) logistic regression, scoring
# lambda by misclassification rate; lambda.min minimises the CV error.
set.seed(123)
cv_model = cv.glmnet(x=train_X, y=train_y, alpha=1, family="binomial", type.measure = "class", nfolds = 10)
best_lambda <- cv_model$lambda.min
best_lambda
## [1] 0.0001231903
Plot cross validation model
plot(cv_model)
Store names of predictor variables
# Extract the non-zero coefficients at the SAME lambda used for the final
# model. coef() on a cv.glmnet object defaults to s = "lambda.1se", but
# best_lambda above was taken from lambda.min — pin s explicitly so the
# selected-predictor list matches the model actually fit below.
coefficient_matrix = as.matrix(coef(cv_model, s = "lambda.min"))
non_zero_indices <- which(coefficient_matrix != 0)
non_zero_coefs <- coefficient_matrix[non_zero_indices, ]
names_of_predictors = rownames(coefficient_matrix)[non_zero_indices]
#cbind(rownames(coefficient_matrix)[non_zero_indices], coefficient_matrix[coefficient_matrix!=0])
Finalize logistic model using the ideal lambda value
final_model <- glmnet(train_X, train_y, alpha = 1, family = "binomial", lambda = best_lambda)
Cast test_y data as 1s and 0s
#cbind(predicted_classes, test_y)
# Recode the held-out labels as 1 (>50K) / 0 (<=50K) for the confusion matrix.
test_y = ifelse(test_y == ">50K", 1, 0)
Make predictions, construct confusion matrix where 0 represents individuals who earn <= 50K and 1 represents individuals who earn > 50K
# Predicted probabilities of income > 50K on the held-out set.
predictions <- predict(final_model, newx = test_X, type = "response")
#predicted_classes <- ifelse(predictions > 0.5, ">50K", "<=50K")
# Threshold at 0.5: 1 = ">50K", 0 = "<=50K".
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
confusion <- confusionMatrix(factor(predicted_classes), factor(test_y))
confusion_df <- data.frame(confusion$table)
# Label each cell by whether it lies on the diagonal (correct) or off it.
# The previous version mapped fill to raw colour-name strings and then
# titled that legend "Count", mislabelling what the colours encode.
confusion_df$Outcome <- with(confusion_df, ifelse(Prediction == Reference, "Correct", "Incorrect"))
ggplot(data = confusion_df, aes(x = factor(Prediction), y = factor(Reference), fill = Outcome)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), color = "black", vjust = 1, size = 10) +
scale_fill_manual(values = c("Correct" = "green", "Incorrect" = "red")) +
theme_minimal() +
labs(x = "Predicted", y = "Actual", fill = "Prediction") +
ggtitle("Confusion Matrix Heatmap") +
scale_y_discrete(limits = rev(levels(factor(confusion_df$Reference)))) +
theme(plot.title = element_text(hjust = 0.5))
Find R squared value of logistic model
print(paste("R squared value:", round(caret::R2(predicted_classes, test_y), 4)))
## [1] "R squared value: 0.3265"
Precision, Recall, F1
# caret's confusionMatrix table has Prediction in rows and Reference in
# columns, with levels ordered 0 then 1 — so [2,2] is the true positives
# for the ">50K" (= 1) class.
tn = confusion$table[1, 1]
tp = confusion$table[2, 2]
fp = confusion$table[2, 1]
fn = confusion$table[1, 2]
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
# Calculate F1 Score (harmonic mean of precision and recall)
f1_score <- 2 * (precision * recall) / (precision + recall)
print(paste("Precision:", round(precision,3)))
## [1] "Precision: 0.733"
print(paste("Recall: ", round(recall, 3)))
## [1] "Recall: 0.607"
print(paste("F1 Score: ", round(f1_score,3)))
## [1] "F1 Score: 0.664"
Calculate accuracy
# Overall test-set accuracy (fraction of correct 0/1 predictions).
accuracy <- mean(predicted_classes == test_y)
print(paste("Testing Accuracy:", round(100*accuracy, 3)))
## [1] "Testing Accuracy: 84.809"
Plot ROC curve figure 1
# ROC curve from the predicted PROBABILITIES, not the thresholded 0/1
# classes: hard class labels collapse the ROC to a single operating point,
# which defeats the purpose of the curve.
PRROC_obj <- roc.curve(scores.class0 = as.numeric(predictions), weights.class0 = test_y, curve = TRUE)
plot(PRROC_obj)
Plot ROC curve figure 2
# Build the ROC from a plain numeric vector of predicted probabilities.
# glmnet's predict() returns an n x 1 matrix, which pROC deprecates as a
# predictor (the warning in the original run), and thresholded classes
# would make the curve degenerate anyway.
roc = roc(test_y, as.numeric(predictions))
roc_dat = data.frame(TPR=roc$sensitivities, FPR=1-roc$specificities)
ggplot(roc_dat, aes(x=FPR, y=TPR))+geom_line()
Train Test split by 80% training and 20% testing datasets
df3 <- df
# Square-root transform the heavily zero-inflated capital columns to tame
# their skew before modelling.
df3$capital.gain <- sqrt(df$capital.gain)
df3$capital.loss <- sqrt(df$capital.loss)
# df3['over50'] <- df$age > 50
# Design matrix for predicting age; `- age_group` drops the bin variable
# derived directly from the response. NOTE(review): column 1 of X is the
# CSV row-index column `X` (it appears first in the coefficient names
# below); the random-forest fits drop it via train_X[,-1], but the lasso
# below still sees it — confirm that is intended.
X = model.matrix(age~0+.-age_group, data=df3)
X = scale(X)
y= df3$age
# Seed the RNG so the 80/20 split is reproducible.
set.seed(123)
train_index = sample(1:nrow(df3), 0.8 * nrow(df3))
train_X = X[train_index, ]
train_y = y[train_index]
test_X = X[-train_index, ]
test_y = y[-train_index]
# Refer to columns directly inside aes(); `df3$col` in aes() is discouraged
# (it bypasses the data argument and breaks grouping/faceting semantics),
# and triggered the warnings in the original run.
ggplot(data = df3, aes(x = hours.per.week, y = age)) + geom_point()
10 fold cross validation and finding best lambda for ‘age’ numeric variable
# 10-fold cross-validated lasso for the continuous response age (default
# gaussian family, MSE loss). lambda.1se picks the sparsest model within
# one standard error of the minimum CV error.
set.seed(123)
cv_model = cv.glmnet(x=train_X, y=train_y, alpha=1, nfolds = 10)
best_lambda <- cv_model$lambda.1se
best_lambda
## [1] 0.08800013
Plot cross validation model
plot(cv_model)
Store names of predictor variables
# Non-zero coefficients at the default s = "lambda.1se", which here matches
# best_lambda (chosen as lambda.1se above), so the list is consistent with
# the final model.
coefficient_matrix = as.matrix(coef(cv_model))
non_zero_indices <- which(coefficient_matrix != 0)
non_zero_coefs <- coefficient_matrix[non_zero_indices, ]
names_of_predictors = rownames(coefficient_matrix)[non_zero_indices]
names_of_predictors
## [1] "(Intercept)" "X"
## [3] "workclassFederal-gov" "workclassPrivate"
## [5] "workclassSelf-emp-inc" "workclassSelf-emp-not-inc"
## [7] "workclassState-gov" "workclassWithout-pay"
## [9] "fnlwgt" "educationHighschool"
## [11] "educationAssociates" "educationSome-college"
## [13] "educationProf-school" "educationMasters"
## [15] "educationDoctorate" "education.num"
## [17] "marital.statusMarried-AF-spouse" "marital.statusMarried-spouse-absent"
## [19] "marital.statusNever-married" "marital.statusSeparated"
## [21] "marital.statusWidowed" "occupationArmed-Forces"
## [23] "occupationCraft-repair" "occupationExec-managerial"
## [25] "occupationFarming-fishing" "occupationHandlers-cleaners"
## [27] "occupationMachine-op-inspct" "occupationOther-service"
## [29] "occupationPriv-house-serv" "occupationProtective-serv"
## [31] "occupationTech-support" "occupationTransport-moving"
## [33] "relationshipNot-in-family" "relationshipOther-relative"
## [35] "relationshipOwn-child" "relationshipWife"
## [37] "raceBlack" "raceOther"
## [39] "capital.gain" "capital.loss"
## [41] "hours.per.week" "native.countryCanada"
## [43] "native.countryCuba" "native.countryEl-Salvador"
## [45] "native.countryGreece" "native.countryGuatemala"
## [47] "native.countryHong" "native.countryHungary"
## [49] "native.countryIndia" "native.countryIran"
## [51] "native.countryItaly" "native.countryMexico"
## [53] "native.countryPhilippines" "native.countryPoland"
## [55] "native.countryScotland" "native.countryTaiwan"
## [57] "income>50K"
Finalize linear model using the ideal lambda value
final_model <- glmnet(train_X, train_y, alpha = 1, lambda = best_lambda)
Plot cut-off point of Lasso Regression
# Draw the lasso coefficient paths, then add the cut-off line as a separate
# statement: base-graphics calls are side-effecting, so chaining them with
# `+` (ggplot style) only worked by accident and printed integer(0).
plot(glmnet(train_X, train_y, alpha = 1), xvar = "lambda")
abline(v = log(best_lambda))
Predict ‘age’ using test set
# Held-out predictions; predict() on a glmnet fit returns an n x 1 matrix,
# and residuals (prediction minus actual) inherits that shape.
predictions <- predict(final_model, newx = test_X, type = "response")
residuals <- predictions - test_y
RMSE, MAPE, R2
# Held-out error metrics for the lasso age model (RMSE/MAE in years,
# MAPE in percent).
print(paste("Root Mean Squared Error:", round(RMSE(predictions, test_y), 3)))
## [1] "Root Mean Squared Error: 10.296"
print(paste("Mean Absolute Error:", round(MAE(predictions, test_y), 3)))
## [1] "Mean Absolute Error: 8.093"
print(paste("Mean Squared Error:", round(mean((predictions - test_y)^2), 3)))
## [1] "Mean Squared Error: 106.017"
print(paste("Mean Absolute Percentage Error:", round(mean(abs((test_y-predictions)/test_y))*100,3)))
## [1] "Mean Absolute Percentage Error: 22.492"
print(paste("R squared:", round(caret::R2(predictions, test_y),5)))
## [1] "R squared: 0.39907"
*** Linearity Assumptions
# Residuals-vs-fitted: points should scatter evenly around the red zero
# line if the linearity / constant-variance assumptions hold.
predres <- data.frame(predictions = predictions, residuals = residuals)
ggplot(data = predres, aes(x=predictions, y=residuals)) + geom_point() +
geom_hline(yintercept = 0, color = 'red')
# Normal Q-Q plot of the residuals against the reference line.
ggplot(predres, aes(sample=residuals))+stat_qq()+stat_qq_line(color='red')
PCA with categorical variables converted to factors
# 80/20 split for principal components regression on age.
# Seed the RNG so the split is reproducible.
set.seed(123)
train_index = sample(1:nrow(df), 0.8 * nrow(df))
# Drop column 17 (age_group), which is derived directly from the response.
df2 <- df[,-c(17)]
train = df2[train_index,]
# BUG FIX: the test set was previously built with df2[train_index,], making
# it identical to the training set — so the "test" RMSE below was really a
# training-set RMSE. Hold out the complementary rows instead.
test = df2[-train_index,]
pcr_model <- pcr(age~., data = train, scale = T)
summary(pcr_model)
RMSE
# Predict on the held-out test set with 75 components (previously this
# predicted on `train`, consistent with the train==test bug above).
predictions <- predict(pcr_model, newdata = test, ncomp = 75)
residuals <- predictions - test$age
rmse <- sqrt(mean((test$age - predictions)^2))
rmse
# NOTE(review): the originally reported RMSE (10.40209) reflected the
# train-as-test evaluation; expect a somewhat different (likely higher)
# value on a true hold-out set when the document is re-knit.
*** Linearity Assumptions
# Diagnostics for the PCR fit: residuals-vs-fitted (should scatter evenly
# around the red zero line) and a normal Q-Q plot of the residuals.
# NOTE(review): predictions/residuals here come from the PCR chunk above.
predres <- data.frame(predictions = predictions, residuals = residuals)
ggplot(data = predres, aes(x=predictions, y=residuals)) + geom_point() +
geom_hline(yintercept = 0, color = 'red')
ggplot(predres, aes(sample=residuals))+stat_qq()+stat_qq_line(color='red')
Random Forest model
# Random forest regression of age on the scaled design matrix: 500 trees,
# 10 candidate variables per split. train_X[,-1] drops the first model-
# matrix column (the row-index `X` column, per the coefficient names listed
# above — confirm).
rf.fit <- randomForest(train_X[,-1], train_y, ntree = 500, mtry = 10)
rf.fit
##
## Call:
## randomForest(x = train_X[, -1], y = train_y, ntree = 500, mtry = 10)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 94.39412
## % Var explained: 45.84
# Held-out RMSE in years.
predictions <- predict(rf.fit, test_X)
rmse <- sqrt(mean((predictions - test_y)^2))
print(paste("Root Mean Squared Error:", rmse))
## [1] "Root Mean Squared Error: 9.84802586715378"
# Variable importance, then actual vs predicted age with the identity line
# (red) and a fitted linear trend (blue).
varImpPlot(rf.fit)
results <- data.frame(Actual = test_y, Predicted = predictions)
# Plot predicted vs actual values using ggplot2
ggplot(results, aes(x=Predicted, y=Actual)) +
geom_point() +
geom_abline(slope=1, intercept=0, color="red") +
labs(title="Actual vs Predicted Age", x="Predicted Age", y="Actual Age") +
geom_smooth(method = 'lm', color = 'blue') +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Random forest model log transform response
# Refit the random forest on log(age) to stabilise variance.
library(randomForest)
# NOTE(review): train_y/test_y are overwritten in place with their logs, so
# re-running earlier chunks after this point would operate on logged values.
train_y = log(train_y)
test_y = log(test_y)
rf.fit <- randomForest(train_X[,-1], train_y, ntree = 500, mtry = 10)
rf.fit
##
## Call:
## randomForest(x = train_X[, -1], y = train_y, ntree = 500, mtry = 10)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 0.05966574
## % Var explained: 51.34
predictions <- predict(rf.fit, test_X)
# RMSE on the log scale; rmse_orig back-transforms to years (computed but
# not printed below).
rmse <- sqrt(mean((predictions - test_y)^2))
rmse_orig <- sqrt(mean((exp(predictions) - exp(test_y))^2))
print(paste("Root Mean Squared Error:", rmse))
## [1] "Root Mean Squared Error: 0.247449244148933"
# Variable importance, then actual vs predicted age (back-transformed with
# exp) with the identity line (red) and a fitted linear trend (blue).
varImpPlot(rf.fit)
results <- data.frame(Actual = exp(test_y), Predicted = exp(predictions))
# Plot predicted vs actual values using ggplot2
ggplot(results, aes(x=Predicted, y=Actual)) +
geom_point() +
geom_abline(slope=1, intercept=0, color="red") +
labs(title="Actual vs Predicted Age", x="Predicted Age", y="Actual Age") +
geom_smooth(method = 'lm', color = 'blue') +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'